section .text
bits 64

; Win64 calling convention:
;	input parameters: rcx, rdx, r8, r9, [rsp+28h], [rsp+30h], ...
;	output parameter: rax
;	stack contains: [rsp] (8) return, [rsp+8h] (20h) "shadow space" to preserve registers
;	need to preserve registers: rbx, rsi, rdi, r12, r13, r14, r15, rbp
;	may destroy registers: rax, rcx, rdx, r8, r9, r10, r11

; The caller must push any stack parameters (if needed)
; and then allocate 20h bytes of free stack space (sub rsp,20h).

; A function must align the stack to a 16-byte boundary (= do 1 push)
; and preserve the registers rbx, rsi, rdi, r12, r13, r14, r15, rbp.

; Note: Functions with short code can be made conditional on the SOFTASM flag.

; =============================================================================
;                          ADD uint128 (this += num)
; =============================================================================
; call:    AddU128A_x64(rcx = u64* dst, rdx = u64* src)
; effect:  the 128-bit value at dst (low qword at [rcx], high qword at [rcx+8])
;          is increased by the 128-bit value at src; a carry out of the high
;          qword is dropped
; scratch: r8, flags

global AddU128A_x64

AddU128A_x64:
        mov     r8, [rdx]               ; low qword of src
        add     [rcx], r8               ; dst.lo += src.lo, CF = carry
        mov     r8, [rdx + 8]           ; high qword of src (mov keeps CF intact)
        adc     [rcx + 8], r8           ; dst.hi += src.hi + CF
        ret

; =============================================================================
;                          ADD uint128 (this += 0:num)
; =============================================================================
; call:    AddU128B_x64(rcx = u64* dst, rdx = u64 num)
; effect:  the 128-bit value at dst (low qword at [rcx]) is increased by the
;          zero-extended 64-bit num; a carry out of the high qword is dropped
; scratch: flags only

global AddU128B_x64

AddU128B_x64:
        add     [rcx], rdx              ; dst.lo += num, CF = carry
        adc     qword [rcx + 8], 0      ; propagate the carry into dst.hi
        ret

; =============================================================================
;                          ADD uint128 (this = num1 + num2)
; =============================================================================
; call:    AddU128C_x64(rcx = u64* dst, rdx = u64* src1, r8 = u64* src2)
; effect:  dst = src1 + src2 (128-bit, low qword first); both operands are
;          read completely before dst is written; carry out is dropped
; scratch: r10, r11, flags

global AddU128C_x64

AddU128C_x64:
        mov     r10, [rdx]              ; r11:r10 = src1
        mov     r11, [rdx + 8]
        add     r10, [r8]               ; low sum, CF = carry
        adc     r11, [r8 + 8]           ; high sum + carry
        mov     [rcx], r10
        mov     [rcx + 8], r11
        ret

; =============================================================================
;                          ADD uint128 (this = num1 + 0:num2)
; =============================================================================
; call:    AddU128D_x64(rcx = u64* dst, rdx = u64* src1, r8 = u64 src2)
; effect:  dst = src1 + zero-extended src2 (128-bit, low qword first);
;          src1 is read completely before dst is written
; scratch: r10, r11, flags

global AddU128D_x64

AddU128D_x64:
        mov     r10, [rdx]              ; r11:r10 = src1
        mov     r11, [rdx + 8]
        add     r10, r8                 ; low qword + src2, CF = carry
        adc     r11, 0                  ; carry into the high qword
        mov     [rcx], r10
        mov     [rcx + 8], r11
        ret

; =============================================================================
;                          SUB uint128 (this -= num)
; =============================================================================
; call:    SubU128A_x64(rcx = u64* dst, rdx = u64* src)
; effect:  the 128-bit value at dst (low qword at [rcx]) is decreased by the
;          128-bit value at src; a borrow out of the high qword is dropped
; scratch: r8, flags

global SubU128A_x64

SubU128A_x64:
        mov     r8, [rdx]               ; low qword of src
        sub     [rcx], r8               ; dst.lo -= src.lo, CF = borrow
        mov     r8, [rdx + 8]           ; high qword of src (mov keeps CF intact)
        sbb     [rcx + 8], r8           ; dst.hi -= src.hi + CF
        ret

; =============================================================================
;                          SUB uint128 (this -= 0:num)
; =============================================================================
; call:    SubU128B_x64(rcx = u64* dst, rdx = u64 num)
; effect:  the 128-bit value at dst (low qword at [rcx]) is decreased by the
;          zero-extended 64-bit num; a borrow out of the high qword is dropped
; scratch: flags only

global SubU128B_x64

SubU128B_x64:
        sub     [rcx], rdx              ; dst.lo -= num, CF = borrow
        sbb     qword [rcx + 8], 0      ; propagate the borrow into dst.hi
        ret

; =============================================================================
;                          SUB uint128 (this = num1 - num2)
; =============================================================================
; call:    SubU128C_x64(rcx = u64* dst, rdx = u64* src1, r8 = u64* src2)
; effect:  dst = src1 - src2 (128-bit, low qword first); the low qword of dst
;          is written before the high qwords of the operands are read
; scratch: r9, flags

global SubU128C_x64

SubU128C_x64:
        ; low qword
        mov     r9, [rdx]
        sub     r9, [r8]                ; CF = borrow
        mov     [rcx], r9

        ; high qword (the mov instructions above/below leave CF untouched)
        mov     r9, [rdx + 8]
        sbb     r9, [r8 + 8]
        mov     [rcx + 8], r9
        ret

; =============================================================================
;                          SUB uint128 (this = num1 - 0:num2)
; =============================================================================
; call:    SubU128D_x64(rcx = u64* dst, rdx = u64* src1, r8 = u64 src2)
; effect:  dst = src1 - zero-extended src2 (128-bit, low qword first); the low
;          qword of dst is written before the high qword of src1 is read
; scratch: r9, flags

global SubU128D_x64

SubU128D_x64:
        ; low qword
        mov     r9, [rdx]
        sub     r9, r8                  ; CF = borrow
        mov     [rcx], r9

        ; high qword: only the borrow is subtracted (mov keeps CF intact)
        mov     r9, [rdx + 8]
        sbb     r9, 0
        mov     [rcx + 8], r9
        ret

; =============================================================================
;                          SUB uint128 (this = 0:num1 - num2)
; =============================================================================
; call:    SubU128E_x64(rcx = u64* dst, rdx = u64 src1, r8 = u64* src2)
; effect:  dst = zero-extended src1 - src2 (128-bit, low qword first); the low
;          qword of dst is written before the high qword of src2 is read
; scratch: rdx, r9, flags

global SubU128E_x64

SubU128E_x64:
        xor     r9d, r9d                ; high qword of 0:src1, zeroed before CF is live
        sub     rdx, [r8]               ; low difference, CF = borrow
        mov     [rcx], rdx
        sbb     r9, [r8 + 8]            ; 0 - src2.hi - borrow
        mov     [rcx + 8], r9
        ret

; =============================================================================
;                          SUB uint128 (this = num - this)
; =============================================================================
; call:    SubU128F_x64(rcx = u64* dst, rdx = u64* src)
; effect:  the 128-bit value at dst (low qword at [rcx]) is replaced by
;          src - dst; a borrow out of the high qword is dropped
; scratch: r8, flags

global SubU128F_x64

SubU128F_x64:
        ; low qword
        mov     r8, [rdx]
        sub     r8, [rcx]               ; src.lo - dst.lo, CF = borrow
        mov     [rcx], r8

        ; high qword (mov keeps CF intact)
        mov     r8, [rdx + 8]
        sbb     r8, [rcx + 8]           ; src.hi - dst.hi - borrow
        mov     [rcx + 8], r8
        ret

; =============================================================================
;                          SUB uint128 (this = 0:num - this)
; =============================================================================
; call:    SubU128G_x64(rcx = u64* dst, rdx = u64 num)
; effect:  the 128-bit value at dst (low qword at [rcx]) is replaced by
;          zero-extended num - dst; a borrow out of the high qword is dropped
; scratch: rdx, r8, flags

global SubU128G_x64

SubU128G_x64:
        xor     r8d, r8d                ; high qword of 0:num, zeroed before CF is live
        sub     rdx, [rcx]              ; num - dst.lo, CF = borrow
        mov     [rcx], rdx
        sbb     r8, [rcx + 8]           ; 0 - dst.hi - borrow
        mov     [rcx + 8], r8
        ret

; =============================================================================
;                          INC uint128 (this++)
; =============================================================================
; call:    IncU128A_x64(rcx = u64* dst)
; effect:  the 128-bit value at dst (low qword at [rcx]) is increased by 1
; scratch: flags only

global IncU128A_x64

IncU128A_x64:
        add     qword [rcx], 1          ; add (not inc) so that CF reports the carry
        adc     qword [rcx + 8], 0      ; propagate the carry into the high qword
        ret

; =============================================================================
;                          INC uint128 (this = num + 1)
; =============================================================================
; call:    IncU128B_x64(rcx = u64* dst, rdx = u64* src)
; effect:  dst = src + 1 (128-bit, low qword first); the low qword of dst is
;          written before the high qword of src is read
; scratch: r8, flags

global IncU128B_x64

IncU128B_x64:
        ; low qword
        mov     r8, [rdx]
        add     r8, 1                   ; CF = carry
        mov     [rcx], r8

        ; high qword (mov keeps CF intact)
        mov     r8, [rdx + 8]
        adc     r8, 0
        mov     [rcx + 8], r8
        ret

; =============================================================================
;                          DEC uint128 (this--)
; =============================================================================
; call:    DecU128A_x64(rcx = u64* dst)
; effect:  the 128-bit value at dst (low qword at [rcx]) is decreased by 1
; scratch: flags only

global DecU128A_x64

DecU128A_x64:
        sub     qword [rcx], 1          ; sub (not dec) so that CF reports the borrow
        sbb     qword [rcx + 8], 0      ; propagate the borrow into the high qword
        ret

; =============================================================================
;                          DEC uint128 (this = num - 1)
; =============================================================================
; call:    DecU128B_x64(rcx = u64* dst, rdx = u64* src)
; effect:  dst = src - 1 (128-bit, low qword first); the low qword of dst is
;          written before the high qword of src is read
; scratch: r8, flags

global DecU128B_x64

DecU128B_x64:
        ; low qword
        mov     r8, [rdx]
        sub     r8, 1                   ; CF = borrow
        mov     [rcx], r8

        ; high qword (mov keeps CF intact)
        mov     r8, [rdx + 8]
        sbb     r8, 0
        mov     [rcx + 8], r8
        ret

; =============================================================================
;                          MUL uint128 (this = num1 * num2)
; =============================================================================
; call:    MulU128A_x64(rcx = u64* dst, rdx = u64* src1, r8 = u64* src2)
; effect:  dst = low 128 bits of src1 * src2 (low qword first); both operands
;          are read completely before dst is written
; scratch: rax, rdx, r9, r10, r11, flags
;
; With src1 = a1:a0 and src2 = b1:b0 the truncated product is
;   low  qword = low64(a0*b0)
;   high qword = high64(a0*b0) + low64(a1*b0) + low64(a0*b1)

global MulU128A_x64

MulU128A_x64:
        mov     r10, [rdx]              ; r10 = a0
        mov     r11, [rdx + 8]          ; r11 = a1
        mov     rax, [r8]               ; rax = b0
        mov     r9, [r8 + 8]            ; r9  = b1

        imul    r11, rax                ; r11 = low64(a1*b0)
        imul    r9, r10                 ; r9  = low64(a0*b1)
        add     r11, r9                 ; sum of the two cross terms

        mul     r10                     ; rdx:rax = a0*b0 (full 128-bit product)
        add     rdx, r11                ; fold the cross terms into the high qword

        mov     [rcx], rax
        mov     [rcx + 8], rdx
        ret

; =============================================================================
;                          MUL uint128 (this = num1 * 0:num2)
; =============================================================================
; call:    MulU128B_x64(rcx = u64* dst, rdx = u64* src1, r8 = u64 src2)
; effect:  dst = low 128 bits of src1 * zero-extended src2 (low qword first);
;          src1 is read completely before dst is written
; scratch: rax, rdx, r10, flags

global MulU128B_x64

MulU128B_x64:
        mov     rax, [rdx]              ; rax = src1 low qword
        mov     r10, [rdx + 8]          ; r10 = src1 high qword

        imul    r10, r8                 ; r10 = low64(src1.hi * src2)
        mul     r8                      ; rdx:rax = src1.lo * src2
        add     rdx, r10                ; high qword of the truncated product

        mov     [rcx], rax
        mov     [rcx + 8], rdx
        ret

; =============================================================================
;                          MUL uint128 (this = 0:num1 * 0:num2)
; =============================================================================
; call:    MulU128C_x64(rcx = u64* dst, rdx = u64 src1, r8 = u64 src2)
; effect:  dst = full 128-bit product of the two 64-bit values
;          (low qword at [rcx], high qword at [rcx+8])
; scratch: rax, rdx, flags

global MulU128C_x64

MulU128C_x64:
        mov     rax, r8                 ; rax = src2
        mul     rdx                     ; rdx:rax = src2 * src1
        mov     [rcx], rax
        mov     [rcx + 8], rdx
        ret

; =============================================================================
;                  MUL uint128 with full range (this = num1 * num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2, r9=u64* high
;
; Computes the full 256-bit product of two 128-bit numbers by schoolbook
; multiplication of their 64-bit halves (N0 = low qword, N1 = high qword).
; The low 128 bits are stored to dst (rcx), the high 128 bits to high (r9).
; Both operands are loaded into registers before the first store.
; destroys: rax, rdx, r8, r10, r11, flags (rbx is saved and restored)
; NOTE(review): rbx (non-volatile) is modified without any unwind data being
; declared in this file - confirm this is acceptable for the build.

global MulU128D_x64

MulU128D_x64:

; ----- push registers (rbx is kept in the caller's shadow space, rsp unchanged)

		mov		[rsp+8],rbx

; ----- load num1 -> r11:r10

		mov		r11,[rdx+8]
		mov		r10,[rdx]

; ----- load num2 -> rbx:r8

		mov		rbx,[r8+8]
		mov		r8,[r8]

; ----- multiply num1.N0 * num2.N0 -> r8:[rcx]

		mov		rax,r8
		mul		r10
		mov		[rcx],rax		; result qword 0 is final
		xchg	rax,r8			; rax <- num2.N0 (r8 <- stored low half, no longer needed)
		xchg	rdx,r8			; r8 <- result HIGH (rdx <- stale value, overwritten by next mul)
		
; ----- multiply num1.N1 * num2.N0 -> r10:r8:[rcx]

		mul		r11
		add		r8,rax			; accumulate into result qword 1
		adc		rdx,byte 0		; carry into the high half of this partial product
		xchg	rdx,r10			; r10 <- result HIGH, rdx <- num1.N0
		
; ----- multiply num1.N0 * num2.N1 -> r8:r10:[rcx+8]:[rcx]
		
		mov		rax,rbx			; rax <- num2.N1
		mul		rdx				; rdx = num1.N0
		add		rax,r8
		mov		[rcx+8],rax		; result qword 1 is final
		adc		r10,rdx			; r10 <- result HIGH
		mov		r8,0			; mov (not xor) so that CF from the adc above survives
		adc		r8,r8			; r8 <- carry

; ----- multiply num1.N1 * num2.N1
		
		xchg	rax,rbx			; rax <- num2.N1 (rbx is restored from [rsp+8] below)
		mul		r11
		add		rax,r10
		mov		[r9],rax		; result qword 2
		adc		r8,rdx
		mov		[r9+8],r8		; result qword 3
		
; ----- pop registers

		mov		rbx,[rsp+8]
		ret

; =============================================================================
;                          MUL10 uint128 (this = num * 10)
; =============================================================================
; call:    MulU128E_x64(rcx = u64* dst, rdx = u64* src)
; effect:  dst = low 128 bits of src * 10 (low qword first); the low qword of
;          dst is written before the high qword of src is read
; output:  rax = carry (the part of the product above bit 127)
; scratch: rdx, r9, r10, r11, flags

global MulU128E_x64

MulU128E_x64:
        mov     r10, rdx                ; keep the src pointer, mul overwrites rdx
        mov     r11d, 10                ; multiplier (32-bit write zero-extends)

        ; low qword
        mov     rax, [r10]
        mul     r11                     ; rdx:rax = src.lo * 10
        mov     [rcx], rax
        mov     r9, rdx                 ; carry into the high qword

        ; high qword
        mov     rax, [r10 + 8]
        mul     r11                     ; rdx:rax = src.hi * 10
        add     rax, r9
        adc     rdx, 0
        mov     [rcx + 8], rax

        mov     rax, rdx                ; return the overflow above 128 bits
        ret

; =============================================================================
;                          SQR uint128 (this = num * num)
; =============================================================================
; call:    SqrU128A_x64(rcx = u64* dst, rdx = u64* src)
; effect:  dst = low 128 bits of src * src (low qword first); src is read
;          completely before dst is written
; scratch: rax, rdx, r9, flags
;
; With src = n1:n0 the truncated square is
;   low  qword = low64(n0*n0)
;   high qword = high64(n0*n0) + 2 * low64(n0*n1)

global SqrU128A_x64

SqrU128A_x64:
        mov     rax, [rdx]              ; rax = n0
        mov     r9, [rdx + 8]           ; r9  = n1

        imul    r9, rax                 ; r9 = low64(n0*n1)
        add     r9, r9                  ; the cross term appears twice

        mul     rax                     ; rdx:rax = n0*n0
        add     rdx, r9

        mov     [rcx], rax
        mov     [rcx + 8], rdx
        ret

; =============================================================================
;                          SQR uint128 (this = 0:num * 0:num)
; =============================================================================
; call:    SqrU128B_x64(rcx = u64* dst, rdx = u64 src)
; effect:  dst = full 128-bit square of the 64-bit src
;          (low qword at [rcx], high qword at [rcx+8])
; scratch: rax, rdx, flags

global SqrU128B_x64

SqrU128B_x64:
        mov     rax, rdx                ; rax = src
        mul     rdx                     ; rdx:rax = src * src
        mov     [rcx], rax
        mov     [rcx + 8], rdx
        ret

; =============================================================================
;                          DIV uint128 (this = num1 / num2)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1, r8=u64* src2, r9=u64* rem
;
; Bit-by-bit (shift and subtract) division of two 128-bit numbers.
; The quotient is stored to dst; the remainder is stored to rem unless rem
; is a null pointer. Both operands are loaded before anything is stored.
; The loop body is unrolled twice, so each of the 64 passes decides two
; quotient bits. The divisor is not checked for zero.
;
; Register roles inside the loop:
;	r15:r14 = running remainder      r13:r12 = dividend bits not yet consumed
;	r11:r10 = divisor                rdi:rsi = quotient accumulator
;	r9:r8   = single-bit mask of the quotient bit being decided
;	rcx     = pass counter
; destroys: rcx, r8, r9, r10, r11, flags

global DivU128A_x64

DivU128A_x64:

; ----- push registers
; (rbx is saved and restored although this function never references it)

		push	rbx
		push	rsi
		push	rdi
		push	r12
		push	r13
		push	r14
		push	r15

; ----- prepare dividend -> r15:r14:r13:r12 (0:0:num1_N1:num1_N0)

		mov		r12,[rdx]		; R12 <- num1_N0
		mov		r13,[rdx+8]		; R13 <- num1_N1
		xor		r14,r14			; R14 <- 0
		xor		r15,r15			; R15 <- 0

; ----- prepare divisor -> r11:r10 (num2_N1:num2_N0)

		mov		r10,[r8]		; R10 <- num2_N0
		mov		r11,[r8+8]		; R11 <- num2_N1
		
; ----- push destination RCX and remainder R9
; (both registers are reused below as loop counter and mask)

		push	rcx
		push	r9
		
; ----- prepare accumulator -> RDI:RSI

		xor		rdi,rdi			; RDI <- 0
		xor		rsi,rsi			; RSI <- 0
		
; ----- prepare mask -> R9:R8 (starts at bit 127, the first quotient bit decided)

		xor		r8,r8			; R8 <- 0
		mov		r9,8000000000000000h ; R9 <- set bit 63
		
; ----- prepare to divide

		mov		rcx,128/2			; number of bits/2

DivU128A_x64_2:		

; ----- shift dividend left (next dividend bit enters the remainder R15:R14)

		shl		r12,1
		rcl		r13,1
		rcl		r14,1
		rcl		r15,1
		
; ----- compare dividend with divisor (high qwords first, low qwords on a tie)

		cmp		r15,r11
		ja		DivU128A_x64_3
		jb		DivU128A_x64_4
		cmp		r14,r10
		jb		DivU128A_x64_4
		
; ----- sub divisor from dividend		
		
DivU128A_x64_3:

		sub		r14,r10
		sbb		r15,r11

; ----- add mask to accumulator (set this quotient bit)

		or		rsi,r8
		or		rdi,r9

; ----- shift mask right

DivU128A_x64_4:

		shr		r9,1
		rcr		r8,1
		
; ----- shift dividend left (second, unrolled step of the same pass)

		shl		r12,1
		rcl		r13,1
		rcl		r14,1
		rcl		r15,1
		
; ----- compare dividend with divisor

		cmp		r15,r11
		ja		DivU128A_x64_5
		jb		DivU128A_x64_6
		cmp		r14,r10
		jb		DivU128A_x64_6
		
; ----- sub divisor from dividend		
		
DivU128A_x64_5:

		sub		r14,r10
		sbb		r15,r11

; ----- add mask to accumulator

		or		rsi,r8
		or		rdi,r9

; ----- shift mask right

DivU128A_x64_6:

		shr		r9,1
		rcr		r8,1
		
; ----- next loop (do not use "loop" instruction, it may be too slow)

		dec		rcx
		jnz		DivU128A_x64_2

; ----- pop destination RCX and remainder R9

		pop		r9
		pop		rcx
		
; ----- save result (quotient)
		
		mov		[rcx],rsi
		mov		[rcx+8],rdi

; ----- save remainder (skipped when the rem pointer is null)

		test	r9,r9
		jz		DivU128A_x64_9
		mov		[r9],r14
		mov		[r9+8],r15		

; ----- pop registers

DivU128A_x64_9:

		pop		r15
		pop		r14
		pop		r13
		pop		r12
		pop		rdi
		pop		rsi
		pop		rbx
		ret

; =============================================================================
;                          DIV uint128 (this = num1 / 0:num2)
; =============================================================================
; call:    DivU128B_x64(rcx = u64* dst, rdx = u64* src1, r8 = u64 src2)
; effect:  dst = src1 / src2 (128-bit quotient, low qword first); src1 is
;          read completely before dst is written
; output:  rax = remainder
; scratch: rdx, r10, flags
;
; Long division with 64-bit digits: the remainder of the high-qword division
; stays in rdx and becomes the upper half of the second dividend.

global DivU128B_x64

DivU128B_x64:
        mov     r10, [rdx]              ; r10 = src1 low qword
        mov     rax, [rdx + 8]          ; rax = src1 high qword
        xor     edx, edx                ; dividend = 0:high

        div     r8                      ; rax = high quotient, rdx = remainder
        mov     [rcx + 8], rax

        mov     rax, r10                ; dividend = remainder:low
        div     r8                      ; rax = low quotient, rdx = final remainder
        mov     [rcx], rax

        mov     rax, rdx
        ret

; =============================================================================
;                          DIV uint128 (this = num1 / 0:0:0:num2)
; =============================================================================
; call:    DivU128C_x64(rcx = u64* dst, rdx = u64* src1, r8 = u32 src2)
; effect:  dst = src1 / src2 (128-bit quotient, low qword first); src1 is
;          read completely before dst is written
; output:  eax = remainder
; scratch: rdx, r9, r10, r11, flags
;
; Long division with 32-bit digits, most significant digit first. Only r8d is
; used as divisor. The remainder of each step stays in edx and becomes the
; upper half of the next 64-bit dividend edx:eax.

global DivU128C_x64

DivU128C_x64:
        mov     r10, [rdx]              ; r10 = src1 low qword
        mov     r11, [rdx + 8]          ; r11 = src1 high qword
        xor     edx, edx                ; running remainder = 0

        ; ----- high qword: upper digit, then lower digit
        mov     rax, r11
        shr     rax, 32                 ; eax = digit 3
        div     r8d                     ; eax = quotient digit, edx = remainder
        mov     r9, rax                 ; rax is zero-extended by the 32-bit div
        shl     r9, 32                  ; park the quotient digit in the upper half
        mov     eax, r11d               ; eax = digit 2
        div     r8d
        or      rax, r9                 ; combine both quotient digits
        mov     [rcx + 8], rax

        ; ----- low qword: upper digit, then lower digit
        mov     rax, r10
        shr     rax, 32                 ; eax = digit 1
        div     r8d
        mov     r9, rax
        shl     r9, 32
        mov     eax, r10d               ; eax = digit 0
        div     r8d
        or      rax, r9
        mov     [rcx], rax

        mov     eax, edx                ; return the final remainder
        ret

; =============================================================================
;     DIV uint128 (this = num1H:num1L / num2), with full range of dividend
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src1L, r8=u64* src1H, r9=u64* src2, [rsp+28h]=u64* rem
;
; Bit-by-bit (shift and subtract) division of a 256-bit dividend
; (num1H:num1L) by a 128-bit divisor. 128 quotient bits are produced and
; stored to dst; the remainder is stored to rem unless rem is a null pointer.
; All operands are loaded before anything is stored.
; The loop body is unrolled twice, so each of the 64 passes decides two
; quotient bits. The divisor is not checked for zero.
; NOTE(review): presumably the caller must guarantee num1H < num2 so that the
; quotient fits into 128 bits - confirm against the callers.
;
; Register roles inside the loop:
;	r15:r14 = running remainder      r13:r12 = dividend bits not yet consumed
;	r11:r10 = divisor                rdi:rsi = quotient accumulator
;	r9:r8   = single-bit mask of the quotient bit being decided
;	rcx     = pass counter
; destroys: rcx, r8, r9, r10, r11, flags

global DivU128D_x64

DivU128D_x64:

; ----- push registers (7 registers, shift offset in stack by 38h)
; (rbx is saved and restored although this function never references it)

		push	rbx
		push	rsi
		push	rdi
		push	r12
		push	r13
		push	r14
		push	r15

; ----- prepare dividend -> r15:r14:r13:r12 (num1H_N1:num1H_N0:num1L_N1:num1L_N0)

		mov		r12,[rdx]		; R12 <- num1L_N0
		mov		r13,[rdx+8]		; R13 <- num1L_N1
		mov		r14,[r8]		; R14 <- num1H_N0
		mov		r15,[r8+8]		; R15 <- num1H_N1

; ----- prepare divisor -> r11:r10 (num2_N1:num2_N0)

		mov		r10,[r9]		; R10 <- num2_N0
		mov		r11,[r9+8]		; R11 <- num2_N1
		
; ----- push destination RCX (rcx is reused below as loop counter)

		push	rcx
		
; ----- prepare accumulator -> RDI:RSI

		xor		rdi,rdi			; RDI <- 0
		xor		rsi,rsi			; RSI <- 0
		
; ----- prepare mask -> R9:R8 (starts at bit 127, the first quotient bit decided)

		xor		r8,r8			; R8 <- 0
		mov		r9,8000000000000000h ; R9 <- set bit 63
		
; ----- prepare to divide

		mov		rcx,128/2			; number of bits/2

DivU128D_x64_2:		

; ----- shift dividend left
; A bit shifted out of R15 means the remainder needs 129 bits and is therefore
; larger than any 128-bit divisor: subtract without comparing.

		shl		r12,1
		rcl		r13,1
		rcl		r14,1
		rcl		r15,1
		jc		DivU128D_x64_3
		
; ----- compare dividend with divisor (high qwords first, low qwords on a tie)

		cmp		r15,r11
		ja		DivU128D_x64_3
		jb		DivU128D_x64_4
		cmp		r14,r10
		jb		DivU128D_x64_4
		
; ----- sub divisor from dividend		
		
DivU128D_x64_3:

		sub		r14,r10
		sbb		r15,r11

; ----- add mask to accumulator (set this quotient bit)

		or		rsi,r8
		or		rdi,r9

; ----- shift mask right

DivU128D_x64_4:

		shr		r9,1
		rcr		r8,1
		
; ----- shift dividend left (second, unrolled step of the same pass)

		shl		r12,1
		rcl		r13,1
		rcl		r14,1
		rcl		r15,1
		jc		DivU128D_x64_5
		
; ----- compare dividend with divisor

		cmp		r15,r11
		ja		DivU128D_x64_5
		jb		DivU128D_x64_6
		cmp		r14,r10
		jb		DivU128D_x64_6
		
; ----- sub divisor from dividend		
		
DivU128D_x64_5:

		sub		r14,r10
		sbb		r15,r11

; ----- add mask to accumulator

		or		rsi,r8
		or		rdi,r9

; ----- shift mask right

DivU128D_x64_6:

		shr		r9,1
		rcr		r8,1
		
; ----- next loop (do not use "loop" instruction, it may be too slow)

		dec		rcx
		jnz		DivU128D_x64_2

; ----- pop destination RCX

		pop		rcx
		
; ----- save result (quotient)
		
		mov		[rcx],rsi
		mov		[rcx+8],rdi

; ----- save remainder (skipped when the rem pointer is null)
; The 5th parameter was at [rsp+28h] on entry; the 7 register pushes above
; moved it by 38h, hence [rsp+60h] (rcx has already been popped).

		mov		r9,[rsp+60h]
		test	r9,r9
		jz		DivU128D_x64_9
		mov		[r9],r14
		mov		[r9+8],r15		

; ----- pop registers

DivU128D_x64_9:

		pop		r15
		pop		r14
		pop		r13
		pop		r12
		pop		rdi
		pop		rsi
		pop		rbx
		ret

; =============================================================================
;                          DIV10 uint128 (this = num / 10)
; =============================================================================
; inputs: rcx=u64* dst, rdx=u64* src
; output: eax=remainder
;
; Divides a 128-bit number by 10 without using the div instruction.
; Every partial division x/10 is done as high64(x * 0CCCCCCCCCCCCCCCDh) >> 3.
; The number is processed in three steps, most significant part first:
; N1 (64 bits), N0H (upper 32 bits of N0) and N0L (lower 32 bits of N0).
; The remainder of each step is placed above the next 32-bit part before
; that part is divided. src is read completely before dst is written.
; destroys: rax, rdx, r8, r10, r11, flags

global DivU128E_x64

DivU128E_x64:

		mov		r11,[rdx+8]	; r11 = N1
		mov		r10,[rdx]	; r10 = N0
		mov		r8,0cccccccccccccccdh ; r8 = 1/10 (fixed-point reciprocal multiplier)

		; N1/10 -> rdx
		mov		rax,r8		; rax <- 1/10
		mul		r11			; mul by N1
		shr		rdx,3
		mov		[rcx+8],rdx	; result N1

		; remainder N1 % 10 -> r11
		lea		rax,[rdx+rdx*4] ; rax <- result N1 * 5
		add		rax,rax		; rax <- result N1 * 10
		sub		r11,rax		; r11 <- remainder 0..9

		; N0H/10 -> r11, rdx
		mov		rax,r10		; rax <- N0
		shr		rax,32		; rax <- N0H
		shl		r11,32		; r11 <- shift remainder to high position
		or		rax,r11		; rax <- remainder:N0H
		mul		r8
		shr		rdx,3		; rdx <- result N0H/10
		mov		r11,rdx
		shl		r11,32		; r11 <- save result into high position

		; remainder N0H % 10 -> rdx
		; (only the low 32 bits of this difference are meaningful; the upper
		; bits are discarded by the 32-bit left shift in the next step)
		lea		rax,[rdx+rdx*4] ; rax <- result N0H * 5
		add		rax,rax		; rax <- result N0H * 10
		mov		rdx,r10		; rdx <- N0
		shr		rdx,32		; rdx <- N0H
		sub		rdx,rax		; rdx <- remainder
		
		; N0L/10 -> rdx, r11
		mov		eax,r10d	; rax <- N0L (32-bit move zero-extends)
		shl		rdx,32		; rdx <- shift remainder to high position
		or		rax,rdx		; rax <- remainder:N0L
		mul		r8
		shr		rdx,3		; rdx <- result N0L/10
		or		r11,rdx		; r11 <- result of N0
		mov		[rcx],r11	; save result N0

		; remainder N0L % 10 -> rdx
		; (computed with 32-bit registers, so only the low 32 bits take part)
		lea		rax,[rdx+rdx*4] ; rax <- result N0L * 5
		add		rax,rax		; rax <- result N0L * 10
		mov		edx,r10d	; rdx <- N0L
		sub		edx,eax		; rdx <- remainder

		mov		eax,edx
		ret

; =============================================================================
;                          NEG uint128 (this = -this)
; =============================================================================
; call:    NegU128A_x64(rcx = u64* dst)
; effect:  the 128-bit value at dst (low qword at [rcx]) is replaced by its
;          two's complement (0 - value); both qwords are read before any
;          of them is written
; scratch: rax, r9, flags

global NegU128A_x64

NegU128A_x64:
        mov     rax, [rcx]
        neg     rax                     ; rax = 0 - low, CF = (low != 0)
        mov     r9d, 0                  ; mov, not xor: CF must survive
        sbb     r9, [rcx + 8]           ; r9 = 0 - high - CF
        mov     [rcx], rax
        mov     [rcx + 8], r9
        ret

; =============================================================================
;                          NEG uint128 (this = -num)
; =============================================================================
; call:    NegU128B_x64(rcx = u64* dst, rdx = u64* src)
; effect:  dst = two's complement (0 - value) of the 128-bit value at src
;          (low qword first); src is read completely before dst is written
; scratch: rax, r9, flags

global NegU128B_x64

NegU128B_x64:
        mov     rax, [rdx]
        neg     rax                     ; rax = 0 - low, CF = (low != 0)
        mov     r9d, 0                  ; mov, not xor: CF must survive
        sbb     r9, [rdx + 8]           ; r9 = 0 - high - CF
        mov     [rcx], rax
        mov     [rcx + 8], r9
        ret

; =============================================================================
;                          NEG uint128 (this = -0:num)
; =============================================================================
; call:    NegU128C_x64(rcx = u64* dst, rdx = u64 src)
; effect:  dst = two's complement (0 - value) of the zero-extended 64-bit src
;          (low qword at [rcx], high qword at [rcx+8])
; scratch: rax, rdx, flags

global NegU128C_x64

NegU128C_x64:
        xor     eax, eax                ; clear rax before the flag-producing neg
        neg     rdx                     ; rdx = 0 - src, CF = (src != 0)
        mov     [rcx], rdx
        sbb     rax, rax                ; rax = 0 when CF = 0, all ones when CF = 1
        mov     [rcx + 8], rax
        ret

; =============================================================================
;                              BITS uint128
; =============================================================================
; call:    BitsU128_x64(rcx = u64* num)
; output:  rax = number of significant bits of the 128-bit value at rcx
;          (index of the highest set bit + 1; 0 when the value is zero)
; scratch: flags
;
; bsr sets ZF when its source operand is zero; the destination is only used
; on the paths where ZF is clear.

global BitsU128_x64

BitsU128_x64:
        bsr     rax, qword [rcx + 8]    ; highest set bit of the high qword
        jnz     .in_high
        bsr     rax, qword [rcx]        ; high qword is zero: look at the low qword
        jnz     .in_low
        xor     eax, eax                ; the whole number is zero
        ret

.in_low:
        inc     rax                     ; bit index -> bit count
        ret

.in_high:
        add     rax, 64 + 1             ; bit index -> bit count, plus the 64 low bits
        ret
